Libraries

library(librarian)
librarian::shelf(tidyverse, broom,
                 ggplot2, ggtext, ggthemes, RColorBrewer,
                 Seurat, SeuratDisk, SingleCellExperiment, mclust, slingshot,
                 quiet = TRUE)

Set working directory:

if (Sys.getenv("RSTUDIO")==1) {
   # setwd to where the editor is, if the IDE is RStudio
  setwd( dirname(rstudioapi::getSourceEditorContext(id = NULL)$path) )
} else {
  # setwd to where the editor is in a general way - maybe less failsafe than the previous
  setwd(here::here())
  # the following checks that the latter went well, but assuming
  # that the user has not changed the name of the repo
  d <- str_split(getwd(), fsep)[[1]][length(str_split(getwd(), fsep)[[1]])]
  if (d != 'Puigetal2023_bioinformatics_scripts') { stop(
    paste0("Could not set working directory automatically to where this",
           " script resides.\nPlease do `setwd()` manually"))
    }
}

To save images outside the repo (to reduce size):

figdir <- paste0(c(head(str_split(getwd(), fsep)[[1]],-1),
                   paste0(tail(str_split(getwd(), fsep)[[1]],1), '_figures')),
                 collapse = fsep)
dir.create(figdir, showWarnings = FALSE)

1 Prepare the dataset

Load data for ISC/EB, EE and pECs

We start with the integrated scRNAseq dataset saved as a Seurat object in script #5, to explore the expression ‘trajectory’ along the differentiation of ISCs into pECs and EE cells. We start with converting the SeuratObject into a SingleCellExperiment object (which can be used by the slingshot library), and filtering the dataset to keep only the cell types of interest:

outpath <- file.path(getwd(), "output")
alldata <- readRDS(file.path(outpath, "gut_int_annot.rds"))
# set up the assay with the integrated expression values
DefaultAssay(alldata) <- "CCA"
# name individual cells by their sequence identifier
alldata@meta.data[["cell"]] <- alldata@assays[["CCA"]]@data@Dimnames[[2]]
# create SCE object and filter by cells of the PMG
if (file.exists( file.path(outpath, "trajectory-sce.rds") )) {
  sce <- readRDS(file.path(outpath, "trajectory-sce.rds"))
} else {
  sce <- as.SingleCellExperiment(alldata)
  rm('alldata')
  keep_celltypes <- c("intestinal stem cell / enteroblast", "enteroendocrine cell",
                      "enterocyte of posterior midgut epithelium")
  keep_cells <- colData(sce) %>%
    as.data.frame() %>%
    dplyr::filter(integrated_celltype %in% keep_celltypes) %>%
    pull(cell)
  sce <- sce[, keep_cells]
  # cache
  saveRDS(sce, file.path(outpath, "trajectory-sce.rds"))
}
as.data.frame.table(table(colData(sce)$integrated_celltype)) %>%
  dplyr::rename(celltype=Var1, ncells=Freq) %>%
  knitr::kable()
celltype ncells
enterocyte of posterior midgut epithelium 2033
enteroendocrine cell 917
intestinal stem cell / enteroblast 1309

Note: here we are selecting the integrated_celltype classification from script #5 rather than the specific_celltype—the one where ISCs and EBs are differentiataed. Differentiating between ISCs and EBs is robust enough to see some differential gene expresion but not enough that we can define well a trajectory that includes EBs as a transient state towards EC and not towards EE.<> s ## Load state markers

We did several tests to define the best list of genes that work as markers for each cell type we are interested in. We performed a differential expression analysis using MAST test only of different types of enterocytes, stem cells and enteroendocrine cells. We selected the 100 markers with smallest p-value for each cell type, and combined them with a list of previously known markers:

Cell type Markers
ISC Dl, Smvt, sna, polo, stf, cnn
EB klu, E(spl)m3-HLH, E(spl)malpha-BFM, E(spl)mbeta-HLH
EC Myo31DF, nub, alphaTry, betaTry
EE pros, AstaA, AstaC, Mip, NPF, CCHa1, CCHa2, Orcokinin, Tk, Dh31

Get the 400 best markers for EE, pEC, EB and ISC types:

# get all markers per cell type (EE, pEC, aEC, EB, ISC)
markerfiles <- list.files(path=file.path(outpath, "markers-mast"),
                          pattern="*.csv", full.names=FALSE)
# remove ECs form anterior midgut - will not be used
markerfiles <- markerfiles[ !str_detect(markerfiles, 'anterior') ]
markers <- tibble(filename = markerfiles) %>%
  mutate(contents = purrr::map(file.path(outpath, "markers-mast", markerfiles),
                        ~ read_csv(.))) %>%
  unnest(cols = c(contents)) %>%
  rowwise %>% mutate(cell_type = str_remove(filename, "_mast.csv")) %>%
  mutate(cell_type = case_when(
    cell_type == 'stem_cell'            ~ 'intestinal stem cell',
    cell_type == 'enterocyte_posterior' ~ 'enterocyte (pmg)',
    TRUE                                ~ cell_type))
# keep the top 100
keep_markers <- markers %>%
  group_by(filename) %>%
  # they are already in ascending order
  top_n(-100,p_val_adj) %>%
  pull(gene)
# check
length(keep_markers); head(keep_markers)
## [1] 400
## [1] "E(spl)mbeta-HLH" "N"               "robo2"           "Tet"            
## [5] "LanA"            "E(spl)m3-HLH"

We add now 4 markers from the manual list:

isc <- c("Dl", "Smvt", "sna", "polo", "stf", "cnn")
eb  <- c("klu", "E(spl)m3-HLH", "E(spl)malpha-BFM", "E(spl)mbeta-HLH")
ec  <- c("Myo31DF", "nub", "alphaTry", "betaTry", "lambdaTry")
ee  <- c("pros", "AstaA", "AstaC", "Mip", "NPF", "CCHa1", "CCHa2", "Orcokinin", "Tk", "Dh31")
keep_markers <- unique(keep_markers, c(isc, eb, ec, ee))
length(keep_markers)
## [1] 304

2 Establish pseudotime trajectories

Now we are in a position to use the slingshot algorithm. This uses Euclidean distances in the construction of ‘pseudotime’, assuming that cells that have transcriptional similarities will be close to each other when in a reduced dimension. The first step of the pseudotime inference is to perform PCA to reduce the dimensionality of the dataset—including only the selected marker genes.

We then group the cells to determine the structure of lineages. Unlike clustering for annotation, which aims to identify cell types, here we want to find branching events and their location. This is done with the mclust package, which uses automated Gaussian mixture modeling.

# read cached results if available
if (file.exists( file.path(outpath, "trajectory-PCA.rds") )) {
  pcadat <- readRDS( file.path(outpath, "trajectory-PCA.rds") )
  } else {
  # calculate trajectories
  ## calculate PCs
  pca1 <- prcomp(t(assays(sce)$logcounts[keep_markers,]), scale=TRUE)
  ## store as df
  pcadat <- pca1$x %>%
    as.data.frame() %>%
    tibble::rownames_to_column("cell") %>%
    left_join(colData(sce) %>% as.data.frame(), by="cell")
  ## reduce dimensions to 'n' components
  ncomponents <- 2
  reducedDims(sce) <- SimpleList(PCA=pca1$x[, 1:ncomponents])
  ## find clusters with GMM
  cl1 <- Mclust(pca1$x[, 1:ncomponents], verbose=FALSE)
  colData(sce)$GMM <- as.factor(cl1$classification)
  pcadat$GMM <- as.factor(cl1$classification)
  ## cache
  saveRDS(pcadat, file.path(outpath, "trajectory-PCA.rds"))
}
# check
pal <- c('#009E73', '#CC79A7', '#E6C31D')
palalpha <- c('#009E7355', '#CC79A755', '#E6C31D55')
ggplot(pcadat, aes(PC1, PC2)) +
  geom_point(aes(fill = factor(integrated_celltype), colour = factor(integrated_celltype)),
             stroke=0.6, shape=21) +
  scale_fill_manual(name='', values = palalpha) +
  scale_colour_manual(name='', values = pal)

From this we can perform the pseudotime analysis. The Slingshot algorithm identifies the global lineage structure with a minimum spanning tree-based clustering and then fits simultaneous principal curves to describe each lineage.

# load trajectories
if (file.exists(file.path(outpath, "slingshot-trajectory.rds"))) {
  sl1 <- readRDS(file.path(outpath, "slingshot-trajectory.rds"))
  } else {
  # calculate
  sl1 <- slingshot(sce, clusterLabels="integrated_celltype", reducedDim="PCA",
                   start.clus="intestinal stem cell / enteroblast", 
                   end.clus=c("enteroendocrine cell","enterocyte of posterior midgut epithelium"))
  # cache
  saveRDS(sl1, file.path(outpath, "slingshot-trajectory.rds"))
}
# plot origin/destination of trajectories
pointcolors <- palalpha[as.factor(sce$integrated_celltype)]
plot(reducedDims(sce)$PCA, col = pointcolors, pch=16, asp = 1, cex=0.5)
lines(SlingshotDataSet(sl1), lwd=2, type = 'lineages', col = '#00000077')
legendlabels = unique(as.factor(sce$integrated_celltype))
legendcolors = unique(pointcolors)
legend("bottomleft", pch=16, legend=legendlabels, col=legendcolors, cex=0.8, bty='n')

We can see an ordering according to what we expected. Here we obtain two branches, one from intestinal stem cell to enteroendocrine cells and other to the enterocyte of posterior midgut epithelium. And we can now make a more refined representation of the expression trajectories:

# extract cell type, PCA position and pseudotime value per cell
pseudotimes <- data.frame(
  cell=colnames(sl1),
  pseudotime1=sl1$slingPseudotime_1, pseudotime2=sl1$slingPseudotime_2,
  PC1=reducedDims(sl1)$PCA[, "PC1"], PC2=reducedDims(sl1)$PCA[, "PC2"],
  integrated_celltype=sl1$integrated_celltype)
# get PCA curves of pseudotime trajectory
sshot <- SlingshotDataSet(sl1)
curve1 <- data.frame(sshot@curves[[1]]$s[sshot@curves[[1]]$ord,]) # Lineage1
curve2 <- data.frame(sshot@curves[[2]]$s[sshot@curves[[2]]$ord,]) # Lineage2
# plot
p <- ggplot(pseudotimes, aes(PC1, PC2)) +
  geom_point(aes(fill = factor(integrated_celltype),
                 colour = factor(integrated_celltype)),
             stroke=0.6, shape=21) +
  scale_fill_manual(name='', values = palalpha) +
  scale_colour_manual(name='', values = pal) +
  geom_path(data=curve1, col="black", lwd=1, linetype="solid") +
  geom_path(data=curve2, col="black", lwd=1, linetype="solid")
p

suppressMessages(
  ggsave('trajectories.pdf', plot = last_plot(), device = 'pdf', path = figdir, dpi = 300)
)

And of the pseudotime value of each cell:

pcamarkers <- data.frame(pseudotime1=sl1$slingPseudotime_1,
                        pseudotime2=sl1$slingPseudotime_2,
                        PC1=reducedDims(sl1)$PCA[, "PC1"],
                        PC2=reducedDims(sl1)$PCA[, "PC2"],
                        cell=colnames(sl1))
ggplot(pcamarkers, aes(PC1, PC2, color=pseudotime1)) +
  geom_point(size=0.5, alpha=0.8) +
  geom_path(data=curve1, col="black", lwd=1, linetype="solid") +
  ggtitle(paste0("lineage 1: ", paste0(sshot@lineages$Lineage1, collapse="->")))

ggplot(pcamarkers, aes(PC1, PC2, color=pseudotime2)) +
  geom_point(size=0.5, alpha=0.8) +
  geom_path(data=curve2, col="black", lwd=1, linetype="solid") +
  ggtitle(paste0("lineage 2: ", paste0(sshot@lineages$Lineage2, collapse="->")))

3 Gene expression along pseudotime

We can now generate a dataframe where to store gene expression, PC1 and PC2, cell type and pseudotime values along two differentiation trajectories.

# load cached trajectories
if (file.exists(file.path(getwd(), "output", "trajectories_markers.csv"))) {
  trajs <- read.csv(file.path(getwd(), "output", "trajectories_markers.csv"))
  } else {
  # calculate trajectories
  ## collect expression data
  markerdat <- logcounts(sce) %>%
    as.matrix() %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene") %>%
    gather("cell", "logcount", -gene) %>%
    filter(gene %in% c(isc, eb, ec, ee, 'emc', 'sc')) # without this step, the final dataset is gigantic
  ## attach pseudotime values
  trajs <- data.frame(pseudotime1=sl1$slingPseudotime_1,
                      pseudotime2=sl1$slingPseudotime_2,
                      PC1=reducedDims(sl1)$PCA[, "PC1"],
                      PC2=reducedDims(sl1)$PCA[, "PC2"],
                      cell=colnames(sl1)) %>%
    left_join(markerdat, by="cell")
  ## store
  write.csv(trajs, file.path(outpath, "trajectories_markers.csv"))
  rm('sce', 'sl1')
  }

With this, we can check that ISC/EB markers decrease expression as pseudotime increases:

# limit genes to ISC/EB markers 
traj_iscmarkers <- filter(trajs, gene %in% c(isc, eb))
# plot expression
ggplot(traj_iscmarkers,
       aes(pseudotime1, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 3)) +
  scale_x_reverse() +
  scale_y_continuous(position = "right") +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EC)')

ggplot(traj_iscmarkers,
       aes(pseudotime2, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 3)) +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EE)')

These graphs plot logcounts vs pseudotime, but a direct scatter/line representation would generate overplotting and noise, obscuring the pattern. This is why we use the geom_smooth. The smoothing algorithm uses a linear model function from where confidence intervals can be calculated from standard errors - these are not plotted here, for simplicity (see below).

We did the same for enterocyte markers, which show increase in the ISC/EB->EC trajectory, but not in the ISC/EB->EE one:

traj_ecmarkers <- filter(trajs, gene %in% ec)
ggplot(traj_ecmarkers,
       aes(pseudotime1, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 3)) +
  scale_x_reverse() +
  scale_y_continuous(position = "right") +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EC)')

ggplot(traj_ecmarkers,
       aes(pseudotime2, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 3)) +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EE)')

And the same for enteroendocrine cells markers, which also show the expected pattern:

traj_eemarkers <- filter(trajs, gene %in% ee)
ggplot(traj_eemarkers,
       aes(pseudotime1, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 4)) +
  scale_x_reverse() +
  scale_y_continuous(position = "right") +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EC)')

ggplot(traj_eemarkers,
       aes(pseudotime2, logcount, group=gene, color=gene)) +
  coord_cartesian(ylim=c(0, 4)) +
  geom_smooth(se=FALSE) +
  xlab('pseudotime (ISC/EB -> EE)')

This all makes sense, so we can turn now to emc and sc, the genes of interest, and plot them with the 95% confidence interval (in grey).

# emc
ggplot(trajs %>% filter(gene == "emc"), aes(pseudotime1, logcount, colour=gene)) +
  scale_x_reverse() +
  scale_y_continuous(position = "right") +
  geom_smooth() +
  xlab('pseudotime (ISC/EB -> EC)')

ggplot(trajs %>% filter(gene == "emc"), aes(pseudotime2, logcount, colour=gene)) +
  geom_smooth() +
  xlab('pseudotime (ISC/EB -> EE)')

# sc
ggplot(trajs %>% filter(gene == "sc"), aes(pseudotime1, logcount, colour=gene)) +
  scale_x_reverse() +
  scale_y_continuous(position = "right") +
  geom_smooth() +
  xlab('pseudotime (ISC/EB -> EC)')

ggplot(trajs %>% filter(gene == "sc"), aes(pseudotime2, logcount, colour=gene)) +
  geom_smooth() +
  xlab('pseudotime (ISC/EB -> EE)')

This shows that, measured by scRNAseq, the expression of emc increases as differentiation progresses towards EC and decreases toward EE. sc, by contrast, seems highest in undifferentiated cells, and decreases as cells differentiate into any lineage. The most novel aspect of this is the expression of emc, so let us plot this a bit better. The approach is based on this answer in Stack Overflow, and requires a bit of preparation:

# prepare data
bar_int <- trajs %>%
  filter(gene == "emc") %>%
  mutate(pseudotime2neg = pseudotime2 * -1)

# prepare bits and bobs
#https://stackoverflow.com/a/49202967/6731772
## x axis
xaxis_begin  <- -30
xaxis_end    <- 35
nxticks <- 14
xtick_frame <-data.frame(
  ticks = seq(xaxis_begin, xaxis_end, length.out = nxticks),
  zero=0)# %>% subset(ticks != 0)
xframe <- data.frame(lab = seq(xaxis_begin, xaxis_end, length.out = nxticks),
                     zero = 0) %>% subset(lab != 0)
## y axis
yaxis_begin  <- -0.5
yaxis_end    <- 2.5
nyticks <- 7
ytick_frame <-data.frame(
  ticks = seq(yaxis_begin, yaxis_end, length.out = nyticks),
  zero=-0.5)# %>% subset(ticks != zero)
yframe <- data.frame(lab = seq(yaxis_begin, yaxis_end, length.out = nyticks),
                     zero = -0.5)# %>% subset(lab != zero)
## tick sizes
xtick_sz <- (tail(yframe$lab, 1) - yframe$lab[1]) / 128
ytick_sz <- (tail(xframe$lab, 1) - xframe$lab[1]) / 128

Preliminary plot:

# preliminary plot
smoop <- ggplot(bar_int, aes(x=pseudotime1, y=logcount)) +
  geom_smooth(colour = '#CC79A7', fill = '#CC79A799',
              na.rm = TRUE) +
  geom_smooth(aes(x=pseudotime2neg),
              colour = '#009E73', fill = '#009E7399',
              na.rm = TRUE)
smoop

Let us add a central Y-axis and a bi-directional X-axis:

smoop <- smoop +
  # y axis line
  geom_segment(x = -0.1, xend = -0.1, 
               y = yframe$zero[1], yend = tail(yframe$lab, 1),
               linewidth = 0.5,
               colour = 'black') +
  # x axis line
  geom_segment(y = -0.5, yend = -0.5,
               x = xframe$lab[1], xend = tail(xframe$lab, 1),
               linewidth = 0.5,
               lineend = 'butt',
               linejoin = 'mitre',
               colour = 'black',
               arrow = arrow(angle = 20,
                             length = unit(1/(2*nxticks), "npc"),
                             ends = "both",
                             type = "open")
               ) +
  # x ticks
  geom_segment(data = xtick_frame[3:nrow(xtick_frame)-1,], 
               aes(x = ticks, xend = ticks, 
                   y = -0.5, yend = -0.5 + xtick_sz),
                   colour = 'black') +
  # y ticks
  geom_segment(data = ytick_frame, 
               aes(x = zero, xend = zero + ytick_sz, 
                   y = ticks, yend = ticks),
                   colour = 'black') +
  theme_void()
smoop

Now remove unwanted stuff:

smoop <- smoop +
  # labels
  geom_text(data=xframe[3:nrow(xframe)-1,], aes(x=lab, y=-0.5, label=lab),
            family = 'Helvetica', vjust=1.5,
            colour = 'black') +
  geom_text(data=yframe, aes(x=rep(-0.25, nrow(yframe)),
                             y=c(lab[[1]]+0.1, lab[2:nrow(yframe)]),
                             label=lab),
            family = 'Helvetica', hjust=1.5,
            colour = 'black') +
  geom_richtext(x = -15, y = -1,
                label = "pseudotime (SEC)",
                col = "grey40") +
  geom_richtext(x = 17.5, y = -1,
                label = "pseudotime (ABS)",
                col = "grey40") +
  theme(legend.position = 'none')
smoop

suppressMessages(
  ggsave('pseudotime_emc_xpn.pdf', plot = last_plot(), device = 'pdf', path = figdir, dpi = 300)
)